TEST 123 testing
h1b_2018 = read.csv(file = "./data/dataset.csv")
## this is a test to read and open the dataset
## I created a folder in our project called "data"
## then inside it I downloaded the dataset and called it "dataset.csv"
### this selects the 16 variables we are interested in
h1b_18_filter = h1b_2018 %>%
janitor::clean_names() %>%
select(case_number, visa_class, case_status, employer_name, employer_country, employer_city, employer_state,
naics_code, soc_code, soc_name, total_workers, employment_start_date, employment_end_date, full_time_position,
prevailing_wage, pw_unit_of_pay, worksite_city, worksite_state, h1b_dependent) %>%
filter(case_status == "CERTIFIED",
visa_class == "H-1B",
employer_country == "UNITED STATES OF AMERICA") %>%
select(-visa_class, -case_status, -employer_country)
library(ggplot2)
library(plotly)
h1b_18_filter %>%
group_by(employer_state) %>%
summarize(state_n = n())
## # A tibble: 55 x 2
## employer_state state_n
## <fct> <int>
## 1 AK 74
## 2 AL 1177
## 3 AR 2531
## 4 AZ 3842
## 5 CA 95700
## 6 CO 3208
## 7 CT 5142
## 8 DC 1706
## 9 DE 2370
## 10 FL 16738
## # ... with 45 more rows
h1b_location =
h1b_18_filter %>%
select(case_number, employer_city, employer_state, worksite_city, worksite_state)
h1b_employer_50 = h1b_location %>%
filter(!employer_state %in% c("DC", "GU", "MP", "PR", "VI")) %>%
group_by(employer_state) %>%
summarize(employer_number = n()) %>%
rename("state" = "employer_state")
h1b_worksite_50 = h1b_location %>%
filter(!worksite_state %in% c("DC", "GU", "MH", "MP", "PR", "VI")) %>%
group_by(worksite_state) %>%
summarize(worksite_number = n()) %>%
rename("state" = "worksite_state")
h1b_location_data_big =
full_join(h1b_employer_50, h1b_worksite_50, by = "state") %>%
filter(worksite_number >= 2000 | employer_number >= 2000)
## Warning: Column `state` joining factors with different levels, coercing to
## character vector
h1b_location_data_small =
full_join(h1b_employer_50, h1b_worksite_50, by = "state") %>%
filter(worksite_number < 2000 & employer_number < 2000) %>%
mutate(employer_number = sum(employer_number),
worksite_number = sum(worksite_number)) %>%
mutate(state = "Others") %>%
select(state, employer_number, worksite_number)
## Warning: Column `state` joining factors with different levels, coercing to
## character vector
others = unique(h1b_location_data_small)
h1b_location_data =
bind_rows(h1b_location_data_big , others)
h1b_location_data %>%
mutate(state = fct_reorder(state, worksite_number)) %>%
plot_ly(x = ~state, y = ~employer_number, type = 'bar', name = 'headquater location') %>%
add_trace(y = ~worksite_number, name = 'work location') %>%
layout(yaxis = list(title = 'Count'), barmode = 'group', legend = list(x = 0.1, y = 0.9))
state_ws =
h1b_2018 %>%
group_by(worksite_state) %>%
summarize(n = n())
state_ws$hover <- with(state_ws, paste("State", worksite_state, "<br>", "Number", n))
g2 <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
map_2 <- plot_geo(state_ws, locationmode = 'USA-states') %>%
add_trace(
z = ~n, text = ~hover, locations = ~worksite_state,
color = ~n, colors = 'Greens'
) %>%
colorbar(title = "H1b Case Number") %>%
layout(
title = '2018 US H1b by worksite State',
geo = g2
)
map_2
state_ws =
h1b_2018 %>%
group_by(worksite_state) %>%
summarize(n = n())
state_ws$hover <- with(state_ws, paste("State", worksite_state, "<br>", "Number", n))
g2 <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
map_2 <- plot_geo(state_ws, locationmode = 'USA-states') %>%
add_trace(
z = ~n, text = ~hover, locations = ~worksite_state,
color = ~n, colors = 'Greens'
) %>%
colorbar(title = "H1b Case Number") %>%
layout(
title = '2018 US H1b by worksite State',
geo = g2
)
map_2
state_number =
h1b_2018 %>%
group_by(employer_state) %>%
summarize(n = n())
state_number$hover <- with(state_number, paste("State", employer_state, "<br>", "Number", n))
l <- list(color = toRGB("white"), width = 2)
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(state_number, locationmode = 'USA-states') %>%
add_trace(
z = ~n, text = ~hover, locations = ~employer_state,
color = ~n, colors = 'Purples'
) %>%
colorbar(title = "Millions USD") %>%
layout(
title = '2018 US H1b by State',
geo = g
)
p
city_ca =
h1b_2018 %>%
filter(worksite_state == 'CA') %>%
group_by(worksite_city) %>%
summarize(n = n()) %>%
filter(min_rank(desc(n)) <= 10)
city_ca %>%
mutate(worksite_city = fct_reorder(worksite_city, n)) %>%
plot_ly(x = ~worksite_city, y = ~n, color = ~worksite_city, type = "bar")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#Listed the top 15 companies that the hires the most H1B applicants.
h1b_soc = h1b_18_filter %>%
select(case_number, employer_name, naics_code, soc_code, soc_name, total_workers, employment_start_date, employment_end_date, full_time_position)
h1b_soc_con = h1b_soc %>%
select(employer_name, total_workers) %>%
group_by(employer_name) %>%
summarize(total = n()) %>%
arrange(desc(total)) %>%
top_n(15)
## Selecting by total
h1b_soc_con %>%
ggplot(aes(x = reorder(employer_name, total), y = total)) +
geom_bar(stat = 'identity') +
coord_flip() +
labs(x = 'Companies', y = 'Total Workers',
title = 'Top 15 companies who has the most H1B Visa Workers')
Regrouping SOC codes broadly - use bls website as source
#The soc code were too specific. The bls website shows that the first two numbers of the soc code categorizes the industry more broadly. Therefore, case_when was used to make a new soc_code variable. Collapsed most of the industries to others for a reasonable graph and table.
h1b_soc_ind = h1b_soc %>%
select(case_number, soc_code) %>%
mutate(soc_code_new = str_remove_all(soc_code, '-'),
soc_code_new = as.integer(soc_code_new)) %>%
na.omit() %>%
mutate(soc_code_gen = case_when(
soc_code_new >= 110000 & soc_code_new < 120000 ~ "Management",
soc_code_new >= 130000 & soc_code_new < 140000 ~"Business, Finance",
soc_code_new >= 150000 & soc_code_new < 160000 ~ "Computer, Math",
soc_code_new >= 170000 & soc_code_new < 180000 ~ "Architecture, Engineer",
soc_code_new >= 190000 & soc_code_new < 200000 ~ "Life, Physcial, Social Science",
soc_code_new >= 210000 & soc_code_new < 220000 ~ "Social Service",
soc_code_new >= 230000 & soc_code_new < 240000 ~ "Legal",
soc_code_new >= 250000 & soc_code_new < 260000 ~ "Education, Training",
soc_code_new >= 270000 & soc_code_new < 280000 ~ "Media, Design",
soc_code_new >= 290000 & soc_code_new < 300000 ~ "Healthcare Practitioner",
soc_code_new >= 410000 & soc_code_new < 420000 ~ "Sales",
TRUE ~ "Others")) %>%
mutate(soc_code_gen = as.factor(soc_code_gen))
## Warning: NAs introduced by coercion
#Using the newly coded industry variable, checked to see which sector is the most popular with the H1B applicants.
h1b_soc_small = h1b_soc_ind %>%
group_by(soc_code_gen) %>%
summarize(num_ind = n()) %>%
arrange(desc(num_ind)) %>%
mutate(soc_code_gen = fct_reorder(soc_code_gen, num_ind))
#h1b_soc_plot = h1b_soc_small %>%
#ggplot(aes(x = reorder(soc_code_gen, num_ind), y = num_ind)) +
#geom_bar(stat = 'identity') +
#coord_flip() +
#labs(x = 'Industries', y = 'Total Workers',
#title = 'Industries and number of certified H1B appliants')
h1b_soc_plot = plot_ly(
h1b_soc_small, x = ~soc_code_gen, y = ~num_ind, color = ~soc_code_gen, type = "bar", colors = "Set1"
) %>%
layout(title = 'Industries and number of certified H1B applicants',
xaxis = list(title = 'Industries'),
yaxis = list(title = 'Total Workers'),
showlegend = FALSE)
h1b_soc_plot
#Checking to see how long employees are contracted for. Seems like most are contracted for 3 years. There's a minor number of people who are contacted for 2 years or 1 year.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
h1b_soc_date <- h1b_soc %>%
select(employment_start_date, employment_end_date) %>%
mutate(employment_start_date = as.Date(employment_start_date),
employment_end_date = as.Date(employment_end_date),
year_start = year(employment_start_date),
year_end = year(employment_end_date),
length_time = year_end - year_start,
length_time= as.factor(length_time))
date_table <- h1b_soc_date %>%
group_by(length_time) %>%
summarize(num_year = n())
## Warning: Factor `length_time` contains implicit NA, consider using
## `forcats::fct_explicit_na`
ggplot(data = h1b_soc_date, aes(x = length_time)) +
geom_bar()
#Checking how the unit of pay actually looks like, before making the new wage variable.
money = h1b_18_filter %>%
select(prevailing_wage, pw_unit_of_pay)
summary(money)
## prevailing_wage pw_unit_of_pay
## Min. : 7.2 : 0
## 1st Qu.: 65728.0 Bi-Weekly: 29
## Median : 80974.0 Hour : 37594
## Mean : 81070.4 Month : 198
## 3rd Qu.: 97323.0 Week : 66
## Max. :456610.0 Year :529092
## this creates a new column for the adjusted wage per year
h1b_18_adjusted_wage =
h1b_18_filter %>%
mutate(pw_unit_of_pay = str_to_lower(pw_unit_of_pay),
adjusted_wage_per_year = prevailing_wage)
## we know that there are 5 levels for pw_unit_of_pay
## year, hour, week, month, bi-weekly
## this converts all pay wages to yearly
h1b_18_adjusted_wage =
h1b_18_adjusted_wage %>%
##select(pw_unit_of_pay, adjusted_wage_per_year, prevailing_wage) %>%
mutate(adjusted_wage_per_year =
if_else(pw_unit_of_pay == "hour", prevailing_wage * 40 * 52,
if_else(pw_unit_of_pay == "week", prevailing_wage * 52,
if_else(pw_unit_of_pay == "month", prevailing_wage * 12,
if_else(pw_unit_of_pay == "bi-weekly", prevailing_wage * 26,
prevailing_wage)))))
# The new annual wage variable and the industries tables are not in the same table, so inner joined by case_number to form one table with both variables.
h1b_soc_wage = h1b_18_adjusted_wage %>%
select(case_number, adjusted_wage_per_year) %>%
inner_join(h1b_soc_ind) %>%
select(soc_code_gen, adjusted_wage_per_year)%>%
arrange (desc(adjusted_wage_per_year)) %>%
mutate(soc_code_gen = fct_reorder(soc_code_gen, adjusted_wage_per_year))
## Joining, by = "case_number"
h1b_wage_plotly = plot_ly(
h1b_soc_wage, x = ~soc_code_gen, y = ~adjusted_wage_per_year, color = ~soc_code_gen, type = "box", colors = "Set2"
) %>%
layout(title = 'Industries and wage distribution of certified H1B applicants',
xaxis = list(title = 'Industries'),
yaxis = list(title = 'Annual Wage'),
showlegend = FALSE)
h1b_wage_plotly